##               plotly            tidyverse              ggrepel 
##                 TRUE                 TRUE                 TRUE 
##          fastDummies                knitr           kableExtra 
##                 TRUE                 TRUE                 TRUE 
##              splines             reshape2 PerformanceAnalytics 
##                 TRUE                 TRUE                 TRUE 
##          correlation                  see               ggraph 
##                 TRUE                 TRUE                 TRUE 
##                psych              nortest                  rgl 
##                 TRUE                 TRUE                 TRUE 
##                  car               ggside            tidyquant 
##                 TRUE                 TRUE                 TRUE 
##                olsrr               jtools             ggstance 
##                 TRUE                 TRUE                 TRUE 
##               magick              cowplot            emojifont 
##                 TRUE                 TRUE                 TRUE 
##                beepr                 Rcpp         equatiomatic 
##                 TRUE                 TRUE                 TRUE 
##                metan 
##                 TRUE

REGRESSÃO NÃO LINEAR SIMPLES E TRANSFORMAÇÃO DE BOX-COX

Knowing my data

load(file = "bebes.RData")
glimpse(bebes)
## Rows: 74
## Columns: 2
## $ comprimento <dbl> 63.07, 65.63, 65.63, 66.73, 66.37, 66.37, 67.47, 70.40, 68…
## $ idade       <dbl> 19.00, 21.00, 22.50, 22.50, 23.25, 23.50, 25.00, 25.00, 26…
#Descriptive stats
summary(bebes)
##   comprimento        idade      
##  Min.   :31.90   Min.   : 2.15  
##  1st Qu.:56.47   1st Qu.:12.50  
##  Median :70.40   Median :26.00  
##  Mean   :66.92   Mean   :25.31  
##  3rd Qu.:75.81   3rd Qu.:33.00  
##  Max.   :87.63   Max.   :60.00
About this data set

This data set contains 74 observations and 2 variables, the variables are comprimento in cm (height in cm) and idade (age) of babys given in week. The goal of this exercise is create a model to predict the desired comprimento of a baby with certain age. Thus, we want to create a model with idade as predictor variable (X), and comprimento as our dependent variable (Y).

Dispersion Chart

Chart with the compriment according with the idade. The function Ggplotly() construct an interactive chart.

# Dispersion Chart
ggplotly(
  ggplot(bebes, aes(x= idade, y = comprimento))+
    geom_point(color = "darkolivegreen1", alpha = 0.2, size = 2)+ # alpha is the crhonbach alpha
    labs(x = "Age in weeks",
         y = "Height in cm") +
  theme_radar_dark()
)


Visualy is possible to verify that the chart has a non-linear function, in this case logaritmic. However, is necessary to compare with an Non-Linear model.
\n To do so we can use loess as a method (method = “loess”), when creating the geom_smooth. Loess means locally estimated scater smooth (gráfico de dispersão estimado localmente com suavização). It is a type of Polinomial regression.

#Dispersion Chart with linear fit
ggplotly(
  bebes %>%
    ggplot()+
      geom_point(aes(x= idade, y = comprimento), color = "darkolivegreen1", alpha = 0.2, size = 2 )+
        geom_smooth(aes(x= idade, y = comprimento),
                    color = "dodgerblue1",
                    method = "lm",
                    formula = y ~ x, se = F)+ #se = F remove confidence interval
    geom_smooth(aes(x = idade, y = comprimento), 
                color = "#FFBBED",
                method = "loess",
                formula = y ~ x,
                se = F)+
      labs(x = "Age in weeks",
           y = "Height in cm")+
        theme_radar_dark()
    )


From the previous Chart, in blue we have the Linear model and in pink the Non-Linear model. In this example R2loess > R2linear, in this way is possible indication that a Non-Linear model is more adequate to this data set.

Estimating Linear model

model_lin_bebes <- lm(formula = comprimento ~ idade, data = bebes) # y ~ x modelo que nos de a equação y
model_lin_bebes
## 
## Call:
## lm(formula = comprimento ~ idade, data = bebes)
## 
## Coefficients:
## (Intercept)        idade  
##     43.1004       0.9411


For the calculated linear model β = 0.9411 and α = 43.1004.

summary(model_lin_bebes)
## 
## Call:
## lm(formula = comprimento ~ idade, data = bebes)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.2238  -1.6240   0.7136   2.8331   6.1956 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 43.10040    1.03446   41.66   <2e-16 ***
## idade        0.94110    0.03642   25.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.037 on 72 degrees of freedom
## Multiple R-squared:  0.9027, Adjusted R-squared:  0.9013 
## F-statistic: 667.7 on 1 and 72 DF,  p-value: < 2.2e-16


At the descriptive Descriptive statistics we have that β and α are statistically significant, thus H0 is rejected and β is statistically significant for the explanation of the behavior of Y. Note that: The R2 = 0.9013 and is a good value however, a Non-Linear model creation still important to confirm if there will be a better fit.

Testing the Residual`s adeherence to the Normality



The non adherence to the normality of the error terms can indicate that the model was specified incorrectly as to its functional form and there was an omission of relevant explanatory variables. So as to correct this problem, the mathematical formula can be altered, or new explanatory variables could be included in the model.

Exist a range of different tests to be applied to verify the normality adeherence. The most commons are:
Shapiro-Wilk test is appropriate for small samples (those with up to 30 observations),
Shapiro-Francia test is more recommended for large sample.

As we are dealing with a big data (>30 samples) set we choose conduct the Shapiro-Francia test. Where we have the hipotesis:

H0:there is a adeherence to the normality (sample has been generated from a normal distribution)
H1:there is no adeherence to the normality (sample not generated from a normal distribution)

Where differently from other tests H0: p_val≥0,05 adere a normalidade.

sf.test(model_lin_bebes$residuals)
## 
##  Shapiro-Francia normality test
## 
## data:  model_lin_bebes$residuals
## W = 0.9087, p-value = 0.000143


Note that by the shapiro-Francia test, for the linear model model_lin_bebes H0 os rejected, so there is no adeherence to the normality.

Histogram
bebes %>%
  mutate(residuos = model_lin_bebes$residuals) %>%
  ggplot(aes(x = residuos))+
  geom_histogram(aes(y = ..density..), 
                 color = "magenta2", 
                 fill = "violet", 
                 bins = 30,
                 alpha = 0.6) +
  stat_function(fun = dnorm, 
                args = list(mean = mean(model_lin_bebes$residuals),
                            sd = sd(model_lin_bebes$residuals)),
                aes(color = "Curva Normal Teórica"),
                size = 2) +
  scale_color_manual("Legenda:",
                     values = "dodgerblue1") +
  labs(x = "Resíduos",
       y = "Frequência") +
  theme(panel.background = element_rect("white"),
        panel.grid = element_line("lightblue1"),
        panel.border = element_rect(NA),
        legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.


Após rodar o histograma model_lin_bebes é possivel observar as diferenças não são est iguais a 0. Ou seja não são significantes.

Behavior of the residuals as a function of the fitted values for model_lin_bebes, with emphasis on the distributions of the variables


The package ‘ggside’ is required.

bebes %>%
  ggplot(aes(x = model_lin_bebes$fitted.values, y = model_lin_bebes$residuals)) +
  geom_point(color = "gold1", size = 2.5) +
  geom_smooth(aes(color = "Fitted Values"),
              method = "lm", formula = y ~ x, se = F, size = 2) +
  geom_xsidedensity(aes(y = after_stat(density)),
                    alpha = 0.5,
                    size = 1,
                    position = "stack") +
  geom_ysidedensity(aes(x = after_stat(density)),
                    alpha = 0.5,
                    size = 1,
                    position = "stack") +
  xlab("Fitted Values") +
  ylab("Resíduos") +
  scale_color_tq() +
  scale_fill_tq() +
  theme_tq() +
  theme(ggside.panel.scale.x = 0.4,
        ggside.panel.scale.y = 0.4)


Shows the relation between residuals and fitted values. Na lateral esquerda a curva normal.

Box-Cox Transformation

Defined as the transformation of Y in which the residuals have maximize the adeherence to the normality. A Box-Cox transformation can help the researcher in the definition of non linear functional forms.

Calculating lambda


λ varies between –∞ and +∞.

lambda_BC <- powerTransform(bebes$comprimento)
lambda_BC
## Estimated transformation parameter 
## bebes$comprimento 
##          2.659051


λ = 2.659051

Insert λ from Box-Cox at the data set to estimate (Y*) at the data set


Creates Y*

bebes$bc_comp <- (((bebes$comprimento ^ lambda_BC$lambda) - 1) / 
                            lambda_BC$lambda)

Estimates a new model OLS with a dependent variable transformed by Box-Cox


Note that the formula input will not be Y ~ X, it will be Y* ~ X.

model_non_lin_bebes <- lm(formula = bc_comp ~ idade, data = bebes)
model_non_lin_bebes
## 
## Call:
## lm(formula = bc_comp ~ idade, data = bebes)
## 
## Coefficients:
## (Intercept)        idade  
##      4995.2        947.2


For the calculated Non-Linear model β = 4995.2 and α = 947.2.

summary(model_non_lin_bebes)
## 
## Call:
## lm(formula = bc_comp ~ idade, data = bebes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6763.0 -1454.6  -489.5  1503.1  6697.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4995.16     630.25   7.926 2.11e-11 ***
## idade         947.23      22.19  42.689  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2460 on 72 degrees of freedom
## Multiple R-squared:  0.962,  Adjusted R-squared:  0.9615 
## F-statistic:  1822 on 1 and 72 DF,  p-value: < 2.2e-16


At the descriptive Descriptive statistics we have that β and α are statistically significant, thus H0 is rejected and β is statistically significant for the explanation of the behavior of Y. Repare que há um salto na qualidade do ajuste para o modelo não linear (R²).

Comparando os parâmetros do modelo_linear com os do modelo_bc CUIDADO!!! OS PARÂMETROS NÃO SÃO DIRETAMENTE COMPARÁVEIS!

The following formula does not work at the Rmarkdown.

export_summs(model_lin_bebes, model_non_lin_bebes, model.names = c(“Modelo Linear”,“Modelo Box-Cox”), scale = F, digits = 4)

data.frame("R2 OLS" = round(summary(model_lin_bebes)$r.squared, 4),
           "R2 BoxCox" = round(summary(model_non_lin_bebes)$r.squared, 4)) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", position = "center", 
                full_width = F, 
                font_size = 30)
R2.OLS R2.BoxCox
0.9027 0.962

Note that: The R2= 0.9615 for non-linear model and R2 = 0.9013 for the linear model being in this case a better fit for the data set, if it is adequate to the normality adeherence.

Shapiro-Francia to model_non_lin_bebes

Warning: The sf.test() test does not allow us to deal with data sets bigger than 5000 observation, to solve it we can simply crack the code and increase the internal number by Click at the function > press F2

sf.test(model_non_lin_bebes$residuals)
## 
##  Shapiro-Francia normality test
## 
## data:  model_non_lin_bebes$residuals
## W = 0.973, p-value = 0.1026


For model_non_lin_bebes p-value >= 0,05, so H0 is accepted and is possible to afirm that the model adehere to the normality.

Histogram for residuals model_non_lin_bebes

bebes %>%
  mutate(residuos = model_non_lin_bebes$residuals)%>%
  ggplot(aes(x = residuos))+
  geom_histogram(aes(y = ..density..),
                 color ="magenta1" ,
                 fill = "violet", bins = 30, alpha = 0.6 )+
  stat_function(fun = dnorm, 
                args = list(mean = mean(model_non_lin_bebes$residuals),
                            sd = sd(model_non_lin_bebes$residuals)),
                aes(color = "Curva Normal Teórica"),
                size = 2) +
  scale_color_manual("Legenda:",
                     values = "#A43B76") +
  labs(x = "Resíduos",
       y = "Frequência") +
  theme(panel.background = element_rect("white"),
        panel.grid = element_line("lightblue1"),
        panel.border = element_rect(NA),
        legend.position = "bottom")

Behavior of the residuals as a function of the fitted values for model_lin_bebes, with emphasis on the distributions of the variables
bebes %>%
  ggplot(aes(x = model_non_lin_bebes$fitted.values, y = model_non_lin_bebes$residuals)) +
  geom_point(color = "gold", size = 2.5) +
  geom_smooth(aes(color = "Fitted Values"),
              method = "lm", formula = y ~ x, se = F, size = 2) +
  geom_xsidedensity(aes(y = after_stat(density)),
                    alpha = 0.5,
                    size = 1,
                    position = "stack") +
  geom_ysidedensity(aes(x = after_stat(density)),
                    alpha = 0.5,
                    size = 1,
                    position = "stack") +
  xlab("Fitted Values") +
  ylab("Resíduos") +
  scale_color_tq() +
  scale_fill_tq() +
  theme_tq() +
  theme(ggside.panel.scale.x = 0.4,
        ggside.panel.scale.y = 0.4)

Making predictions with models Linear and Non Linear

What is the expected height for a baby 52 weeks old?

####For linear model

predict(object = model_lin_bebes,
        data.frame(idade = 52),
        interval = "confidence", level = 0.95)
##        fit      lwr      upr
## 1 92.03749 89.88585 94.18912

####For Non linear model

predict(object = model_non_lin_bebes,
        data.frame(idade = 52),
        interval = "confidence",
        level = 0.95)
##        fit      lwr      upr
## 1 54251.12 52940.22 55562.03


This result is Y*, we need convert the value for Y to find the answer.

Cálculus to obtain fitted values Y
#Não podemos nos esquecer de fazer o cálculo para a obtenção do fitted value de Y (variável 'comprimento')
(((54251.12   * 2.659051) + 1)) ^ (1 / 2.659051)
## [1] 87.14007

Save the fitted values for both models

bebes$yhat_linear <- model_lin_bebes$fitted.values
bebes$yhat_model_non_lin_bebes <- (((model_non_lin_bebes$fitted.values*(lambda_BC$lambda))+
                                    1))^(1/(lambda_BC$lambda))
bebes%>% # For visualization
  select(idade,comprimento,yhat_linear,yhat_model_non_lin_bebes) %>%
  kable()%>%
  kable_styling(bootstrap_options = "striped", font_size = 20, full_width = F)
idade comprimento yhat_linear yhat_model_non_lin_bebes
19.00 63.07 60.98126 63.09730
21.00 65.63 62.86346 65.00430
22.50 65.63 64.27510 66.37586
22.50 66.73 64.27510 66.37586
23.25 66.37 64.98093 67.04434
23.50 66.37 65.21620 67.26472
25.00 67.47 66.62785 68.56259
25.00 70.40 66.62785 68.56259
26.00 68.57 67.56894 69.40568
26.75 71.13 68.27477 70.02702
27.00 69.67 68.51004 70.23211
27.50 71.50 68.98059 70.63934
28.00 70.40 69.45114 71.04271
28.00 73.70 69.45114 71.04271
30.50 72.97 71.80388 73.00487
31.50 73.33 72.74498 73.76574
32.50 74.07 73.68608 74.51381
33.00 75.53 74.15663 74.88322
33.00 78.10 74.15663 74.88322
33.50 74.07 74.62718 75.24963
35.00 74.43 76.03882 76.33147
37.00 77.00 77.92102 77.73552
37.50 77.00 78.39157 78.08002
38.50 78.83 79.33267 78.76157
39.00 77.37 79.80322 79.09871
39.00 79.57 79.80322 79.09871
42.00 81.40 82.62651 81.07325
43.00 81.03 83.56761 81.71400
45.50 81.77 85.92035 83.28045
45.50 83.23 85.92035 83.28045
48.00 81.03 88.27309 84.79949
49.00 84.33 89.21419 85.39464
53.00 84.70 92.97858 87.70916
60.00 87.63 99.56627 91.53189
2.15 31.90 45.12376 40.41261
2.60 33.37 45.54726 41.31699
7.35 44.73 50.01747 49.34301
7.35 46.57 50.01747 49.34301
7.80 47.67 50.44097 49.99728
7.85 47.30 50.48802 50.06911
7.90 47.30 50.53507 50.14076
8.00 46.93 50.62918 50.28356
8.00 46.93 50.62918 50.28356
8.15 48.77 50.77035 50.49650
8.80 50.23 51.38206 51.40251
9.70 50.60 52.22905 52.61486
9.70 52.43 52.22905 52.61486
9.75 52.07 52.27611 52.68087
9.85 50.97 52.37022 52.81246
9.90 51.33 52.41727 52.87806
10.25 51.70 52.74665 53.33349
10.25 52.43 52.74665 53.33349
11.50 55.37 53.92303 54.90964
15.50 59.77 57.68742 59.50864
17.00 65.27 59.09906 61.08955
18.75 64.17 60.74599 62.85212
21.00 67.10 62.86346 65.00430
22.25 68.20 64.03983 66.15053
22.50 68.93 64.27510 66.37586
23.00 69.30 64.74565 66.82274
24.00 70.40 65.68675 67.70194
26.00 66.73 67.56894 69.40568
26.00 70.77 67.56894 69.40568
26.25 72.23 67.80422 69.61381
28.50 72.23 69.92169 71.44232
29.50 74.80 70.86279 72.23061
29.50 75.17 70.86279 72.23061
30.50 74.43 71.80388 73.00487
30.50 75.53 71.80388 73.00487
30.50 75.90 71.80388 73.00487
31.00 78.47 72.27443 73.38694
36.50 80.67 77.45047 77.38846
38.00 81.03 78.86212 78.42203
38.50 78.47 79.33267 78.76157

Models Adjust (fitted values vs. Real values)

bebes %>%
  ggplot()+
  geom_smooth(aes(x = comprimento, y = yhat_linear, color = "OLS Linear"),#line for linear
              method = "lm", se = F, formula = y ~ splines::bs(x, df = 5),
              size = 1.5)+
  geom_point(aes(x = comprimento, y = yhat_linear),#points captured for linear
             color = "#FFBBED", alpha = 0.6, size = 1) +
  geom_smooth(aes(x = comprimento, y = yhat_model_non_lin_bebes, color = "Non linear with Box-Cox"),#line for non linear
              method = "lm", se = F, formula = y ~ splines::bs(x, df = 5),
              size = 1.5) +
  geom_point(aes(x = comprimento, y = yhat_model_non_lin_bebes), #points captured for non linear
             color = "dodgerblue1", alpha = 0.6, size = 1) +
  geom_smooth(aes(x = comprimento, y = comprimento), method = "lm", 
              color = "gray30", size = 1.05,
              linetype = "longdash") +
  scale_color_manual("Modelos:", 
                     values = c("aquamarine", "orchid")) +
  labs(x = "Comprimento", y = "Fitted Values") +
  theme(panel.background = element_rect("white"),
      panel.grid = element_line("grey95"),
      panel.border = element_rect(NA),
      legend.position = "bottom")
## `geom_smooth()` using formula = 'y ~ x'


To compare models is always need to do fitted values vs. actual values. Note that, in this data set, the extreme values are what make the non-linear model have an better fit.

Multiple non Linear Regression

load(file = "empresas.RData")
glimpse(empresas)
## Rows: 124
## Columns: 6
## $ empresa       <chr> "Adidas", "BASF", "Bayer", "BSH", "Bosch", "Daimler AG",…
## $ retorno       <dbl> 63.67, 55.49, 52.42, 54.00, 68.01, 64.13, 68.76, 79.02, …
## $ disclosure    <dbl> 83, 91, 77, 90, 93, 90, 89, 90, 98, 58, 74, 81, 46, 39, …
## $ endividamento <dbl> 1.3, 33.9, 36.1, 20.8, 14.1, 45.2, 31.6, 24.9, 31.5, 15.…
## $ ativos        <dbl> 3967, 5450, 4327, 4109, 4458, 6498, 4993, 6352, 9010, 31…
## $ liquidez      <dbl> 14.3, 16.8, 15.5, 16.8, 17.0, 15.8, 16.0, 16.5, 17.0, 15…
summary(empresas)
##    empresa             retorno        disclosure    endividamento  
##  Length:124         Min.   :21.03   Min.   : 6.00   Min.   : 1.20  
##  Class :character   1st Qu.:33.84   1st Qu.:18.00   1st Qu.:18.52  
##  Mode  :character   Median :44.17   Median :41.00   Median :24.95  
##                     Mean   :46.20   Mean   :49.68   Mean   :27.73  
##                     3rd Qu.:58.20   3rd Qu.:83.00   3rd Qu.:34.12  
##                     Max.   :86.18   Max.   :98.00   Max.   :64.90  
##      ativos        liquidez   
##  Min.   :1851   Min.   : 7.9  
##  1st Qu.:2597   1st Qu.: 9.3  
##  Median :3476   Median :12.2  
##  Mean   :3739   Mean   :12.3  
##  3rd Qu.:4458   3rd Qu.:15.8  
##  Max.   :9010   Max.   :17.0


About my data set:
composed by Retorno de papeis das empresas , indicadores contábeis e financeiros.

Mais negociada o retorno na janela de tempo, em função de uma var X1 chamada disclosure(nota da agencia de classificação, maior maior o grau de transparencia), X2 endividadamento(percentual em relaçao ao patrimonio liq) , X3 posição dos ativos vezes 1milhão (em dólares), liquidez(do balanço patrimonial relaçao entre passivo;ativo circulante).
#### Estuding the correlations

empresas %>%
  corr_plot(retorno, disclosure, endividamento, ativos, liquidez,
            shape.point = 21,
            col.point = "black",
            fill.point = "#FDE725FF",
            size.point = 2,
            alpha.point = 0.6,
            maxsize = 4,
            minsize = 2,
            smooth = TRUE,
            col.smooth = "black",
            col.sign = "red",
            upper = "corr",
            lower = "scatter",
            diag.type = "density",
            col.diag = "orchid",
            pan.spacing = 0,
            lab.position = "bl")


Gráfico de Correlação : Mostra que a magnititue da correlação (a significancia em relaçao a magnit) em funçao do tamanho da amostra pode fazer com que a variável entre no modelo mas dependendo da correlaçao entre variáveis X, pode fazer com que a variavel seja excluida (multicolinearidade e heterocedasticidade).


The values that displays “*” are statistically significant. Se for estisticamente significante fundo vermelho se não for fundo branco para.

Estimating a multiple model

empresas %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", 
                full_width = F, 
                font_size = 22)
empresa retorno disclosure endividamento ativos liquidez
Adidas 63.67 83 1.3 3967 14.3
BASF 55.49 91 33.9 5450 16.8
Bayer 52.42 77 36.1 4327 15.5
BSH 54.00 90 20.8 4109 16.8
Bosch 68.01 93 14.1 4458 17.0
Daimler AG 64.13 90 45.2 6498 15.8
Deutsche Bank 68.76 89 31.6 4993 16.0
Faber-Castell 79.02 90 24.9 6352 16.5
Group Technologies 84.64 98 31.5 9010 17.0
Lufthansa 59.84 58 15.2 3189 15.8
Puma 50.79 74 17.1 3476 16.0
Siemens 47.46 81 64.9 4700 12.2
ThyssenKrupp 36.06 46 47.3 3785 11.0
Volkswagen 34.21 39 22.3 2901 12.6
Opel 47.31 40 33.4 3872 12.6
Deutsche Telekom 46.98 17 23.6 2597 12.2
Wurth 34.54 54 2.4 3477 11.2
DHL Express 38.82 27 25.0 2730 9.3
BHP Billton 30.82 30 28.3 3390 9.2
Holden 38.03 11 13.8 1851 10.4
Qantas 31.92 18 19.0 3369 9.2
Telstra 29.02 13 16.4 2509 9.5
Bunge 31.04 20 30.4 2268 8.7
Fortis 21.47 6 22.5 1938 9.8
InBev 32.70 16 45.4 2239 8.6
Stella Artois 32.61 7 19.6 1872 7.9
Alpargatas 40.16 41 1.2 2996 10.5
AmBev 42.32 35 16.8 3622 12.3
Agrale 37.17 24 16.1 2972 9.7
Aracruz Celulose 48.57 68 34.8 4691 8.9
Azálea 33.66 14 40.0 2303 8.0
Banco Itaú 56.14 83 23.9 3967 14.3
Bematech 61.71 91 33.9 5450 16.8
Bob’s 54.43 77 36.1 4327 15.5
Bradesco 71.84 90 20.8 4109 16.8
Brahma 65.68 93 14.1 4458 17.0
Caloi 72.57 90 45.2 6498 15.8
CSN 62.36 89 31.6 4993 16.0
Embraco 71.09 90 24.9 6352 16.5
Embraer 86.18 98 31.5 9010 17.0
EMS 60.90 58 15.2 3189 15.8
Guararapes 56.33 74 17.1 3476 16.0
Gerdau 44.19 81 2.4 4700 12.2
Globo 44.62 46 47.3 3785 11.0
Gradiente 38.49 39 22.3 2901 12.6
Grupo Edson Queiroz 42.04 40 33.4 3872 12.6
H. Stern 29.53 17 23.6 2597 12.2
Klabin 51.60 54 31.7 3477 11.2
Marcopolo 30.57 27 25.0 2730 9.3
Natura 37.93 30 28.3 3390 9.2
O Boticário 38.25 11 13.8 1851 10.4
Odebrecht 27.52 18 19.0 3369 9.2
Perdigão 39.04 13 37.8 2509 9.5
Petrobras 37.86 20 30.4 2268 8.7
Randon 29.95 6 22.5 1938 9.8
Record 28.46 16 45.4 2239 8.6
Sadia 21.88 7 19.6 1872 7.9
Itaú Unibanco Banco 37.94 41 49.5 2996 10.5
Tramontina 44.96 35 16.8 3622 12.3
Tecnologia Automotiv 46.96 24 16.1 2972 9.7
Vale 36.38 68 34.8 4691 8.9
Votorantim 29.80 14 40.0 2303 8.0
WEG 61.55 83 23.9 3967 14.3
ATI Technologies 70.02 91 33.9 5450 16.8
Bombardier 60.61 77 36.1 4327 15.5
Celestica 52.27 90 20.8 4109 16.8
Cogeco 69.97 93 14.1 4458 17.0
Cognos 57.66 90 45.2 6498 15.8
Nortel 63.92 89 31.6 4993 16.0
Lenovo 60.61 90 24.9 6352 16.5
Sinopec 81.56 98 31.5 9010 17.0
Daewoo 46.41 58 15.2 3189 15.8
Hyundai 63.13 74 17.1 3476 16.0
Kia 37.25 81 64.9 4700 12.2
LG 46.30 46 47.3 3785 11.0
Samsung 45.49 39 22.3 2901 12.6
Ssangyong 36.42 40 33.4 3872 12.6
Lego 31.09 17 23.6 2597 12.2
Maersk 49.28 54 31.7 3477 11.2
Novozymes 33.01 27 25.0 2730 9.3
Atento 36.01 30 28.3 3390 9.2
BBVA 32.14 11 13.8 1851 10.4
Gamesa 38.91 18 19.0 3369 9.2
Iberia Linhas Aéreas 24.89 13 16.4 2509 9.5
Iberdrola 32.19 20 30.4 2268 8.7
Inditex 25.58 6 22.5 1938 9.8
Mapfre 35.26 16 45.4 2239 8.6
Santander 29.29 7 19.6 1872 7.9
Telefónica 40.15 41 49.5 2996 10.5
Zara 55.74 35 16.8 3622 12.3
Repsol 47.02 24 16.1 2972 9.7
3M 46.13 68 34.8 4691 8.9
Altria 32.41 14 40.0 2303 8.0
American Express 54.74 83 23.9 3967 14.3
American Airlines 61.72 91 33.9 5450 16.8
AMD 65.44 77 36.1 4327 15.5
AOL 69.77 90 20.8 4109 16.8
Apple 60.20 93 14.1 4458 17.0
AT&T 64.32 90 45.2 6498 15.8
Avon 69.95 89 31.6 4993 16.0
BankBoston 60.26 90 24.9 6352 16.5
Bank of America 72.85 98 31.5 9010 17.0
BellSouth 44.28 58 15.2 3189 15.8
Blockbuster 48.15 74 17.1 3476 16.0
Boeing 42.14 81 64.9 4700 12.2
Burger King 30.55 46 47.3 3785 11.0
Caterpillar 35.07 39 22.3 2901 12.6
ChevronTexaco 41.78 40 33.4 3872 12.6
Chrysler 43.50 17 23.6 2597 12.2
Cinemark 33.64 54 31.7 3477 11.2
Cisco Systems 26.29 27 25.0 2730 9.3
Colgate-Palmolive 45.00 30 28.3 3390 9.2
Citigroup 35.31 11 13.8 1851 10.4
Coca-Cola 29.11 18 19.0 3369 9.2
CSC 40.27 13 16.4 2509 9.5
Dell 29.75 20 30.4 2268 8.7
Delta Air Lines 26.19 6 22.5 1938 9.8
Deloitte 33.90 16 45.4 2239 8.6
Dow Corning 21.03 7 19.6 1872 7.9
DuPont 39.85 41 49.5 2996 10.5
ExxonMobil 44.16 35 16.8 3622 12.3
FedEx 45.38 24 16.1 2972 9.7
Ford Motors 45.25 68 34.8 4691 8.9
General Electric 33.28 14 40.0 2303 8.0
m_multi_empresas <- lm(formula = retorno ~ . - empresa,
                       data = empresas )
summary(m_multi_empresas)
## 
## Call:
## lm(formula = retorno ~ . - empresa, data = empresas)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1494  -5.3099   0.5667   5.2917  10.8992 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.0506242  4.0797521   1.483   0.1407    
## disclosure     0.1067127  0.0479210   2.227   0.0278 *  
## endividamento -0.0881867  0.0511828  -1.723   0.0875 .  
## ativos         0.0034722  0.0006763   5.134 1.12e-06 ***
## liquidez       1.9761624  0.3963012   4.987 2.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.272 on 119 degrees of freedom
## Multiple R-squared:  0.8325, Adjusted R-squared:  0.8269 
## F-statistic: 147.9 on 4 and 119 DF,  p-value: < 2.2e-16


Note that β_endividamento is not significant at an confidence level of 5%.

m_multi_empresas
## 
## Call:
## lm(formula = retorno ~ . - empresa, data = empresas)
## 
## Coefficients:
##   (Intercept)     disclosure  endividamento         ativos       liquidez  
##      6.050624       0.106713      -0.088187       0.003472       1.976162


By the linear model m_multi_empresas, all variables are included even β_endividamento that is not significant at an confidence level of 95%. Whats is not appropriate. So is needed to do an STEPWISE PROCEDURE.

STEPWISE PROCEDURE


K is the given by Chi-Square at 95%, where df = degree of freedom and lower.tail as a logical parameter that if TRUE (default), probabilities are P[X x]P[X≤x], otherwise, P[X > x]P[X>x]. K gives the best configuration, for multivariated models, at certain confidence level.

#De onde vem o argumento k = 3.841459?
qchisq(p = 0.05, df = 1, lower.tail = F)
## [1] 3.841459
round(pchisq(3.841459, df = 1, lower.tail = F), 7)
## [1] 0.05
step_empresas <- step(m_multi_empresas, k= 3.841459)
## Start:  AIC=469.46
## retorno ~ (empresa + disclosure + endividamento + ativos + liquidez) - 
##     empresa
## 
##                 Df Sum of Sq    RSS    AIC
## - endividamento  1    116.78 4798.1 468.67
## <none>                       4681.3 469.46
## - disclosure     1    195.07 4876.4 470.68
## - liquidez       1    978.17 5659.5 489.15
## - ativos         1   1036.87 5718.2 490.43
## 
## Step:  AIC=468.67
## retorno ~ disclosure + ativos + liquidez
## 
##              Df Sum of Sq    RSS    AIC
## - disclosure  1    138.53 4936.6 468.36
## <none>                    4798.1 468.67
## - ativos      1    940.64 5738.7 487.03
## - liquidez    1   1513.50 6311.6 498.83
## 
## Step:  AIC=468.36
## retorno ~ ativos + liquidez
## 
##            Df Sum of Sq    RSS    AIC
## <none>                  4936.6 468.36
## - ativos    1    2387.1 7323.8 513.43
## - liquidez  1    4616.2 9552.9 546.38
summary(step_empresas)
## 
## Call:
## lm(formula = retorno ~ ativos + liquidez, data = empresas)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.537  -5.533   1.108   4.930  11.831 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.5347536  2.3408742  -1.083    0.281    
## ativos       0.0040223  0.0005258   7.649  5.4e-12 ***
## liquidez     2.7390775  0.2575030  10.637  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.387 on 121 degrees of freedom
## Multiple R-squared:  0.8234, Adjusted R-squared:  0.8205 
## F-statistic: 282.1 on 2 and 121 DF,  p-value: < 2.2e-16


for the Linear model the stepwise removed endividamento and disclosure.

comparing models

export_summs(step_empresas, scale = F, digits = 5)

#Parâmetros reais do modelo com procedimento Stepwise

confint(step_empresas, level = 0.95)
##                    2.5 %      97.5 %
## (Intercept) -7.169131638 2.099624378
## ativos       0.002981241 0.005063329
## liquidez     2.229282413 3.248872510
plot_summs(step_empresas, colors = "#440154FF")
## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools
## Loading required namespace: broom.mixed


Range β variates a lot between ativos and liquidez, this turns difficult to compare results so is needed to standardize the vars.

Standardize Parameters

plot_summs(step_empresas, scale = TRUE, colors = "#440154FF")


From this chart is possible note that liquidez has a bigger relative importance to explain the behavior of retorno.

Comparando ICs dos betas dos modelos sem e com procedimento Stepwise

plot_summs(m_multi_empresas, step_empresas, scale = TRUE, plot.distributions = TRUE,
           inner_ci_level = .95, colors = c("#FDE725FF", "#440154FF"))
## inner_ci_level is ignored when plot.distributions == TRUE and more than one
## model is used.


The model with stepwise remains just 2 variables.

Adeherence to normality SHAPIRO-FRANCIA

sf.test(step_empresas$residuals)
## 
##  Shapiro-Francia normality test
## 
## data:  step_empresas$residuals
## W = 0.97387, p-value = 0.01816


For model_non_lin_bebes p-value >= 0,05, so H0 is accepted and is possible to afirm that the model adehere to the normality.

Histogram
empresas %>%
  mutate(residuos = step_empresas$residuals) %>%
  ggplot(aes(x = residuos)) +
  geom_histogram(aes(y = ..density..), 
                 color = "white", 
                 fill = "aquamarine", 
                 bins = 30,
                 alpha = 0.6) +
  stat_function(fun = dnorm, 
                args = list(mean = mean(step_empresas$residuals),
                            sd = sd(step_empresas$residuals)),
                size = 2, color = "grey30") +
    scale_color_manual(values = "grey50") +
    labs(x = "Resíduos",
         y = "Frequência") +
  theme_bw()

Box-Cox Transformation

Calculating lambda
lambda_emp <- powerTransform(empresas$retorno) #input original data set $ y
lambda_emp
## Estimated transformation parameter 
## empresas$retorno 
##      -0.02256414


λ = -0.02256414

Insert λ from Box-Cox at the data set to estimate (Y*) at the data set

empresas$bcretorno <- (((empresas$retorno ^ lambda_emp$lambda)-1)/
                         (lambda_emp$lambda))
empresas %>%
  select(empresa, retorno, bcretorno, everything()) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", 
                full_width = F, 
                font_size = 18)
empresa retorno bcretorno disclosure endividamento ativos liquidez
Adidas 63.67 3.965002 83 1.3 3967 14.3
BASF 55.49 3.839599 91 33.9 5450 16.8
Bayer 52.42 3.787582 77 36.1 4327 15.5
BSH 54.00 3.814731 90 20.8 4109 16.8
Bosch 68.01 4.024999 93 14.1 4458 17.0
Daimler AG 64.13 3.971556 90 45.2 6498 15.8
Deutsche Bank 68.76 4.034969 89 31.6 4993 16.0
Faber-Castell 79.02 4.161187 90 24.9 6352 16.5
Group Technologies 84.64 4.223394 98 31.5 9010 17.0
Lufthansa 59.84 3.908473 58 15.2 3189 15.8
Puma 50.79 3.758683 74 17.1 3476 16.0
Siemens 47.46 3.696574 81 64.9 4700 12.2
ThyssenKrupp 36.06 3.444002 46 47.3 3785 11.0
Volkswagen 34.21 3.395400 39 22.3 2901 12.6
Opel 47.31 3.693673 40 33.4 3872 12.6
Deutsche Telekom 46.98 3.687256 17 23.6 2597 12.2
Wurth 34.54 3.404263 54 2.4 3477 11.2
DHL Express 38.82 3.511966 27 25.0 2730 9.3
BHP Billton 30.82 3.298927 30 28.3 3390 9.2
Holden 38.03 3.493030 11 13.8 1851 10.4
Qantas 31.92 3.331373 18 19.0 3369 9.2
Telstra 29.02 3.243190 13 16.4 2509 9.5
Bunge 31.04 3.305510 20 30.4 2268 8.7
Fortis 21.47 2.962961 6 22.5 1938 9.8
InBev 32.70 3.353694 16 45.4 2239 8.6
Stella Artois 32.61 3.351147 7 19.6 1872 7.9
Alpargatas 40.16 3.543200 41 1.2 2996 10.5
AmBev 42.32 3.591372 35 16.8 3622 12.3
Agrale 37.17 3.471954 24 16.1 2972 9.7
Aracruz Celulose 48.57 3.717759 68 34.8 4691 8.9
Azálea 33.66 3.380431 14 40.0 2303 8.0
Banco Itaú 56.14 3.850235 83 23.9 3967 14.3
Bematech 61.71 3.936521 91 33.9 5450 16.8
Bob’s 54.43 3.821979 77 36.1 4327 15.5
Bradesco 71.84 4.074779 90 20.8 4109 16.8
Brahma 65.68 3.993292 93 14.1 4458 17.0
Caloi 72.57 4.083958 90 45.2 6498 15.8
CSN 62.36 3.946068 89 31.6 4993 16.0
Embraco 71.09 4.065248 90 24.9 6352 16.5
Embraer 86.18 4.239703 98 31.5 9010 17.0
EMS 60.90 3.924480 58 15.2 3189 15.8
Guararapes 56.33 3.853320 74 17.1 3476 16.0
Gerdau 44.19 3.631087 81 2.4 4700 12.2
Globo 44.62 3.639977 46 47.3 3785 11.0
Gradiente 38.49 3.504104 39 22.3 2901 12.6
Grupo Edson Queiroz 42.04 3.585271 40 33.4 3872 12.6
H. Stern 29.53 3.259334 17 23.6 2597 12.2
Klabin 51.60 3.773160 54 31.7 3477 11.2
Marcopolo 30.57 3.291388 27 25.0 2730 9.3
Natura 37.93 3.490605 30 28.3 3390 9.2
O Boticário 38.25 3.498344 11 13.8 1851 10.4
Odebrecht 27.52 3.193972 18 19.0 3369 9.2
Perdigão 39.04 3.517169 13 37.8 2509 9.5
Petrobras 37.86 3.488903 20 30.4 2268 8.7
Randon 29.95 3.272416 6 22.5 1938 9.8
Record 28.46 3.225127 16 45.4 2239 8.6
Sadia 21.88 2.980609 7 19.6 1872 7.9
Itaú Unibanco Banco 37.94 3.490848 41 49.5 2996 10.5
Tramontina 44.96 3.646943 35 16.8 3622 12.3
Tecnologia Automotiv 46.96 3.686865 24 16.1 2972 9.7
Vale 36.38 3.452150 68 34.8 4691 8.9
Votorantim 29.80 3.267765 14 40.0 2303 8.0
WEG 61.55 3.934156 83 23.9 3967 14.3
ATI Technologies 70.02 4.051471 91 33.9 5450 16.8
Bombardier 60.61 3.920130 77 36.1 4327 15.5
Celestica 52.27 3.784961 90 20.8 4109 16.8
Cogeco 69.97 4.050822 93 14.1 4458 17.0
Cognos 57.66 3.874621 90 45.2 6498 15.8
Nortel 63.92 3.968570 89 31.6 4993 16.0
Lenovo 60.61 3.920130 90 24.9 6352 16.5
Sinopec 81.56 4.189844 98 31.5 9010 17.0
Daewoo 46.41 3.676063 58 15.2 3189 15.8
Hyundai 63.13 3.957245 74 17.1 3476 16.0
Kia 37.25 3.473936 81 64.9 4700 12.2
LG 46.30 3.673887 46 47.3 3785 11.0
Samsung 45.49 3.657697 39 22.3 2901 12.6
Ssangyong 36.42 3.453163 40 33.4 3872 12.6
Lego 31.09 3.306999 17 23.6 2597 12.2
Maersk 49.28 3.731052 54 31.7 3477 11.2
Novozymes 33.01 3.362415 27 25.0 2730 9.3
Atento 36.01 3.442722 30 28.3 3390 9.2
BBVA 32.14 3.337725 11 13.8 1851 10.4
Gamesa 38.91 3.514098 18 19.0 3369 9.2
Iberia Linhas Aéreas 24.89 3.100659 13 16.4 2509 9.5
Iberdrola 32.19 3.339162 20 30.4 2268 8.7
Inditex 25.58 3.126083 6 22.5 1938 9.8
Mapfre 35.26 3.423305 16 45.4 2239 8.6
Santander 29.29 3.251773 7 19.6 1872 7.9
Telefónica 40.15 3.542971 41 49.5 2996 10.5
Zara 55.74 3.843705 35 16.8 3622 12.3
Repsol 47.02 3.688036 24 16.1 2972 9.7
3M 46.13 3.670513 68 34.8 4691 8.9
Altria 32.41 3.345460 14 40.0 2303 8.0
American Express 54.74 3.827168 83 23.9 3967 14.3
American Airlines 61.72 3.936669 91 33.9 5450 16.8
AMD 65.44 3.989961 77 36.1 4327 15.5
AOL 69.77 4.048221 90 20.8 4109 16.8
Apple 60.20 3.913942 93 14.1 4458 17.0
AT&T 64.32 3.974249 90 45.2 6498 15.8
Avon 69.95 4.050562 89 31.6 4993 16.0
BankBoston 60.26 3.914850 90 24.9 6352 16.5
Bank of America 72.85 4.087454 98 31.5 9010 17.0
BellSouth 44.28 3.632955 58 15.2 3189 15.8
Blockbuster 48.15 3.709802 74 17.1 3476 16.0
Boeing 42.14 3.587455 81 64.9 4700 12.2
Burger King 30.55 3.290782 46 47.3 3785 11.0
Caterpillar 35.07 3.418319 39 22.3 2901 12.6
ChevronTexaco 41.78 3.579569 40 33.4 3872 12.6
Chrysler 43.50 3.616636 17 23.6 2597 12.2
Cinemark 33.64 3.379882 54 31.7 3477 11.2
Cisco Systems 26.29 3.151521 27 25.0 2730 9.3
Colgate-Palmolive 45.00 3.647760 30 28.3 3390 9.2
Citigroup 35.31 3.424613 11 13.8 1851 10.4
Coca-Cola 29.11 3.246060 18 19.0 3369 9.2
CSC 40.27 3.545717 13 16.4 2509 9.5
Dell 29.75 3.266210 20 30.4 2268 8.7
Delta Air Lines 26.19 3.147981 6 22.5 1938 9.8
Deloitte 33.90 3.386993 16 45.4 2239 8.6
Dow Corning 21.03 2.943634 7 19.6 1872 7.9
DuPont 39.85 3.536070 41 49.5 2996 10.5
ExxonMobil 44.16 3.630464 35 16.8 3622 12.3
FedEx 45.38 3.655476 24 16.1 2972 9.7
Ford Motors 45.25 3.652843 68 34.8 4691 8.9
General Electric 33.28 3.369942 14 40.0 2303 8.0

Estimates a new model

model_bc <- lm(formula= bcretorno ~ . -empresa - retorno, data = empresas)
model_bc
## 
## Call:
## lm(formula = bcretorno ~ . - empresa - retorno, data = empresas)
## 
## Coefficients:
##   (Intercept)     disclosure  endividamento         ativos       liquidez  
##     2.884e+00      3.404e-03     -1.251e-03      4.331e-05      3.594e-02
summary(model_bc)
## 
## Call:
## lm(formula = bcretorno ~ . - empresa - retorno, data = empresas)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34987 -0.10651  0.01319  0.10093  0.26474 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.884e+00  8.807e-02  32.752  < 2e-16 ***
## disclosure     3.404e-03  1.034e-03   3.291  0.00132 ** 
## endividamento -1.251e-03  1.105e-03  -1.133  0.25966    
## ativos         4.331e-05  1.460e-05   2.967  0.00364 ** 
## liquidez       3.594e-02  8.555e-03   4.202 5.15e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1354 on 119 degrees of freedom
## Multiple R-squared:  0.8012, Adjusted R-squared:  0.7945 
## F-statistic: 119.9 on 4 and 119 DF,  p-value: < 2.2e-16

#Stepwise for new model

step_model_bc <- step(model_bc, k = 3.841459)
## Start:  AIC=-481.79
## bcretorno ~ (empresa + retorno + disclosure + endividamento + 
##     ativos + liquidez) - empresa - retorno
## 
##                 Df Sum of Sq    RSS     AIC
## - endividamento  1   0.02351 2.2049 -484.30
## <none>                       2.1814 -481.79
## - ativos         1   0.16135 2.3427 -476.79
## - disclosure     1   0.19850 2.3799 -474.83
## - liquidez       1   0.32362 2.5050 -468.48
## 
## Step:  AIC=-484.3
## bcretorno ~ disclosure + ativos + liquidez
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    2.2049 -484.30
## - ativos      1   0.14353 2.3484 -480.33
## - disclosure  1   0.17756 2.3825 -478.54
## - liquidez    1   0.47423 2.6791 -463.99
summary(step_model_bc)
## 
## Call:
## lm(formula = bcretorno ~ disclosure + ativos + liquidez, data = empresas)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35140 -0.10168  0.01814  0.09309  0.27990 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.828e+00  7.246e-02  39.019  < 2e-16 ***
## disclosure  3.131e-03  1.007e-03   3.109  0.00235 ** 
## ativos      4.005e-05  1.433e-05   2.795  0.00605 ** 
## liquidez    3.984e-02  7.842e-03   5.080  1.4e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1356 on 120 degrees of freedom
## Multiple R-squared:  0.799,  Adjusted R-squared:  0.794 
## F-statistic:   159 on 3 and 120 DF,  p-value: < 2.2e-16
#Note que a variável 'disclosure' acaba voltando ao modelo na forma funcional não linear!

Shapiro-Francia step_modelo_bc

sf.test(step_model_bc$residuals)
## 
##  Shapiro-Francia normality test
## 
## data:  step_model_bc$residuals
## W = 0.98705, p-value = 0.2461


Adeherence to the normality

Histogram
empresas %>%
    mutate(residuos = step_model_bc$residuals) %>%
    ggplot(aes(x = residuos)) +
    geom_histogram(aes(y = ..density..),
                   color = "white",
                   fill = "#287D8EFF",
                   bins = 30,
                   alpha = 0.6) +
    stat_function(fun = dnorm, 
                  args = list(mean = mean(step_model_bc$residuals),
                              sd = sd(step_model_bc$residuals)),
                  size = 2, color = "grey30") +
    scale_color_manual(values = "grey50") +
    labs(x = "Resíduos",
         y = "Frequência") +
    theme_bw()

#Resumo dos dois modelos obtidos pelo procedimento Stepwise (linear e com Box-Cox) Função ‘export_summs’ do pacote ‘jtools’

export_summs(step_empresas, step_modelo_bc, model.names = c(“Modelo Linear”,“Modelo Box-Cox”), scale = F, digits = 6)

#Parâmetros reais do modelo com procedimento Stepwise e Box-Cox

confint(step_model_bc,level = 0.95)
##                    2.5 %       97.5 %
## (Intercept) 2.684060e+00 2.9710107464
## disclosure  1.136659e-03 0.0051245086
## ativos      1.167775e-05 0.0000684159
## liquidez    2.431351e-02 0.0553669978
plot_summs(step_model_bc, colors = "#287D8EFF")

#Parâmetros padronizados
plot_summs(step_model_bc, scale = TRUE, plot.distributions = TRUE,
           inner_ci_level = .95, colors = "#287D8EFF")

#### Comparando os ICs do betas dos modelos sem e com Transformação de Box-Cox

plot_summs(step_empresas, step_model_bc, scale = T, plot.distributions = TRUE,
           inner_ci_level = .95, colors = c("#440154FF", "#287D8EFF"))
## inner_ci_level is ignored when plot.distributions == TRUE and more than one
## model is used.

#### Predictions


What is the value of retorno, average, to disclosure = 50, liquidez = 14 and ativo = 4000, ceteris paribus?

predict(object = step_model_bc, 
        data.frame(disclosure = 50, 
                   liquidez = 14, 
                   ativos = 4000),
        interval = "confidence", level = 0.95)
##        fit      lwr      upr
## 1 3.702015 3.665555 3.738476
#Não podemos nos esquecer de fazer o cálculo para a obtenção do fitted
#value de Y (retorno)
(((3.702015 * -0.02256414) + 1)) ^ (1 / -0.02256414)
## [1] 47.74258

Saving the fitted values

empresas$yhat_step_empresas <- step_empresas$fitted.values
empresas$yhat_step_model_bc <- (((step_model_bc$fitted.values*(lambda_emp$lambda))+
                                    1))^(1/(lambda_emp$lambda))

View dataset model step_empresas and step_modelo_bc

empresas %>%
  select(empresa, retorno, yhat_step_empresas, yhat_step_model_bc) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", 
                full_width = F, 
                font_size = 22)
empresa retorno yhat_step_empresas yhat_step_model_bc
Adidas 63.67 52.59046 54.07239
BASF 55.49 65.40320 66.16532
Bayer 52.42 57.32538 56.70622
BSH 54.00 60.00932 62.16105
Bosch 68.01 61.96091 64.33723
Daimler AG 64.13 66.87948 66.09250
Deutsche Bank 68.76 61.37376 62.18825
Faber-Castell 79.02 68.20958 67.71376
Group Technologies 84.64 80.27035 80.00752
Lufthansa 59.84 53.56974 51.21424
Puma 50.79 55.27195 55.26243
Siemens 47.46 49.78673 50.60726
ThyssenKrupp 36.06 42.81945 40.96082
Volkswagen 34.21 43.64627 41.24819
Opel 47.31 47.55191 43.17775
Deutsche Telekom 46.98 41.32787 37.12380
Wurth 34.54 42.12840 41.89250
DHL Express 38.82 33.91951 34.08071
BHP Billton 30.82 36.30031 35.27669
Holden 38.03 33.39690 32.58498
Qantas 31.92 36.21584 33.83915
Telstra 29.02 33.57840 32.47082
Bunge 31.04 30.41776 31.78949
Fortis 21.47 32.10339 31.33844
InBev 32.70 30.02721 31.18814
Stella Artois 32.61 26.63368 28.89393
Alpargatas 40.16 38.27633 38.07800
AmBev 42.32 45.72462 41.44602
Agrale 37.17 35.98853 34.68531
Aracruz Celulose 48.57 40.71158 41.92955
Azálea 33.66 28.64119 30.27183
Banco Itaú 56.14 52.59046 54.07239
Bematech 61.71 65.40320 66.16532
Bob’s 54.43 57.32538 56.70622
Bradesco 71.84 60.00932 62.16105
Brahma 65.68 61.96091 64.33723
Caloi 72.57 66.87948 66.09250
CSN 62.36 61.37376 62.18825
Embraco 71.09 68.20958 67.71376
Embraer 86.18 80.27035 80.00752
EMS 60.90 53.56974 51.21424
Guararapes 56.33 55.27195 55.26243
Gerdau 44.19 49.78673 50.60726
Globo 44.62 42.81945 40.96082
Gradiente 38.49 43.64627 41.24819
Grupo Edson Queiroz 42.04 47.55191 43.17775
H. Stern 29.53 41.32787 37.12380
Klabin 51.60 42.12840 41.89250
Marcopolo 30.57 33.91951 34.08071
Natura 37.93 36.30031 35.27669
O Boticário 38.25 33.39690 32.58498
Odebrecht 27.52 36.21584 33.83915
Perdigão 39.04 33.57840 32.47082
Petrobras 37.86 30.41776 31.78949
Randon 29.95 32.10339 31.33844
Record 28.46 30.02721 31.18814
Sadia 21.88 26.63368 28.89393
Itaú Unibanco Banco 37.94 38.27633 38.07800
Tramontina 44.96 45.72462 41.44602
Tecnologia Automotiv 46.96 35.98853 34.68531
Vale 36.38 40.71158 41.92955
Votorantim 29.80 28.64119 30.27183
WEG 61.55 52.59046 54.07239
ATI Technologies 70.02 65.40320 66.16532
Bombardier 60.61 57.32538 56.70622
Celestica 52.27 60.00932 62.16105
Cogeco 69.97 61.96091 64.33723
Cognos 57.66 66.87948 66.09250
Nortel 63.92 61.37376 62.18825
Lenovo 60.61 68.20958 67.71376
Sinopec 81.56 80.27035 80.00752
Daewoo 46.41 53.56974 51.21424
Hyundai 63.13 55.27195 55.26243
Kia 37.25 49.78673 50.60726
LG 46.30 42.81945 40.96082
Samsung 45.49 43.64627 41.24819
Ssangyong 36.42 47.55191 43.17775
Lego 31.09 41.32787 37.12380
Maersk 49.28 42.12840 41.89250
Novozymes 33.01 33.91951 34.08071
Atento 36.01 36.30031 35.27669
BBVA 32.14 33.39690 32.58498
Gamesa 38.91 36.21584 33.83915
Iberia Linhas Aéreas 24.89 33.57840 32.47082
Iberdrola 32.19 30.41776 31.78949
Inditex 25.58 32.10339 31.33844
Mapfre 35.26 30.02721 31.18814
Santander 29.29 26.63368 28.89393
Telefónica 40.15 38.27633 38.07800
Zara 55.74 45.72462 41.44602
Repsol 47.02 35.98853 34.68531
3M 46.13 40.71158 41.92955
Altria 32.41 28.64119 30.27183
American Express 54.74 52.59046 54.07239
American Airlines 61.72 65.40320 66.16532
AMD 65.44 57.32538 56.70622
AOL 69.77 60.00932 62.16105
Apple 60.20 61.96091 64.33723
AT&T 64.32 66.87948 66.09250
Avon 69.95 61.37376 62.18825
BankBoston 60.26 68.20958 67.71376
Bank of America 72.85 80.27035 80.00752
BellSouth 44.28 53.56974 51.21424
Blockbuster 48.15 55.27195 55.26243
Boeing 42.14 49.78673 50.60726
Burger King 30.55 42.81945 40.96082
Caterpillar 35.07 43.64627 41.24819
ChevronTexaco 41.78 47.55191 43.17775
Chrysler 43.50 41.32787 37.12380
Cinemark 33.64 42.12840 41.89250
Cisco Systems 26.29 33.91951 34.08071
Colgate-Palmolive 45.00 36.30031 35.27669
Citigroup 35.31 33.39690 32.58498
Coca-Cola 29.11 36.21584 33.83915
CSC 40.27 33.57840 32.47082
Dell 29.75 30.41776 31.78949
Delta Air Lines 26.19 32.10339 31.33844
Deloitte 33.90 30.02721 31.18814
Dow Corning 21.03 26.63368 28.89393
DuPont 39.85 38.27633 38.07800
ExxonMobil 44.16 45.72462 41.44602
FedEx 45.38 35.98853 34.68531
Ford Motors 45.25 40.71158 41.92955
General Electric 33.28 28.64119 30.27183

Predicted Model Adjust dos modelos: Predicted Values (fitted values) vs. Actual Values

empresas %>%
  ggplot() +
  geom_smooth(aes(x = retorno, y = yhat_step_empresas, color = "Stepwise"),
              method = "lm", se = F, formula = y ~ splines::bs(x, df = 5), size = 1.5) +
  geom_point(aes(x = retorno, y = yhat_step_empresas),
             color = "lightpink3", alpha = 0.6, size = 2) +
  geom_smooth(aes(x = retorno, y = yhat_step_model_bc, color = "Stepwise Box-Cox"),
              method = "lm", se = F, formula = y ~ splines::bs(x, df = 5), size = 1.5) +
  geom_point(aes(x = retorno, y = yhat_step_model_bc),
             color = "blue", alpha = 0.6, size = 2) +
  geom_smooth(aes(x = retorno, y = retorno), method = "lm", formula = y ~ x,
              color = "grey30", size = 1.05,
              linetype = "longdash") +
  scale_color_manual("Modelos:", 
                     values = c("lightpink3", "blue")) +
  labs(x = "Retorno", y = "Fitted Values") +
  theme(panel.background = element_rect("white"),
        panel.grid = element_line("grey95"),
        panel.border = element_rect(NA),
        legend.position = "bottom")


Even that the difference between R2 models is small, the non-linear model(blue line) provides a better construction of ICs to predict the behavior of Y, than a linear model (lightpink3).